Purpose

  1. Build a weighted and directed network of genes based on the Dual RNA seq data generated from the timecourse of Phytophthora cinnamomi and Lupinus angustifolius
  2. Determine modules (clusters of genes) from the network associated to disease progression (for example, biotrophic or necrotrophic stages)
  3. Extract the names of the genes from these hubs and compare them to the genes in the literature
  4. Functionally annotate these genes

Working idea: This network will then be used to identify hub genes for each of the early, middle and late interactions.

Load Libraries

Load Phytophthora cinnamomi gtf file, clean up to get Gene IDs

Combine alternatively spliced transcripts (seen in transcript_id as -RA and RB)

pc.gtf <- rtracklayer::import("https://ftp.ncbi.nlm.nih.gov/genomes/genbank/protozoa/Phytophthora_cinnamomi/latest_assembly_versions/GCA_018691715.1_ASM1869171v1/GCA_018691715.1_ASM1869171v1_genomic.gtf.gz")
pc.df <- as.data.frame((pc.gtf))

pc.genes <- filter(pc.df, type == "start_codon")
pc.geneids <- select(pc.genes, gene_id, product, protein_id)  

pc.annotations <- pc.geneids %>% 
  distinct(gene_id, .keep_all = T)
nrow(pc.annotations)
## [1] 19969
pcannot <- as.matrix(pc.annotations)

#write.table(pcannot, file = "pcgeneids.tsv",quote=FALSE,sep="\t")

Load GO terms blasted from Omicsbox

goterms <- read.delim('pc.goterms.txt', header = TRUE) %>%
  select(c(SeqName, GO.IDs, GO.Names, Description))
  
head(goterms)
##        SeqName                     GO.IDs
## 1 KAG6572551.1               F:GO:0005515
## 2 KAG6572580.1               F:GO:0005515
## 3 KAG6572643.1 P:GO:0015074; F:GO:0003676
## 4 KAG6572667.1               F:GO:0005515
## 5 KAG6572730.1               F:GO:0005515
## 6 KAG6572776.1               F:GO:0005515
##                                    GO.Names
## 1                         F:protein binding
## 2                         F:protein binding
## 3 P:DNA integration; F:nucleic acid binding
## 4                         F:protein binding
## 5                         F:protein binding
## 6                         F:protein binding
##                              Description
## 1       WD repeat-containing protein mio
## 2           CCT motif-containing protein
## 3                  reverse transcriptase
## 4       hypothetical protein IUM83_19800
## 5       hypothetical protein IUM83_18690
## 6 putative PDZ-domain-containing protein
colnames(goterms) <- c("protein_id", "GO_terms", "GO_descripton", "description")

goterms.df <- dplyr::right_join(goterms, pc.annotations, by = 'protein_id')
nrow(goterms.df)
## [1] 19969

Make GO term list for enrichment analysis

splitgt.df <- as.tibble(goterms.df) %>%
  separate_longer_delim(c(GO_terms, GO_descripton), delim = ";")
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
splitgt.df <- as.data.frame(splitgt.df)

#strip leading spaces
splitgt.df$GO_terms <- gsub(" ","",splitgt.df$GO_terms)
splitgt.df$GO_descripton <- gsub("^ ","",splitgt.df$GO_descripton)
head(splitgt.df, 10)
##      protein_id     GO_terms          GO_descripton
## 1  KAG6572551.1 F:GO:0005515      F:protein binding
## 2  KAG6572580.1 F:GO:0005515      F:protein binding
## 3  KAG6572643.1 P:GO:0015074      P:DNA integration
## 4  KAG6572643.1 F:GO:0003676 F:nucleic acid binding
## 5  KAG6572667.1 F:GO:0005515      F:protein binding
## 6  KAG6572730.1 F:GO:0005515      F:protein binding
## 7  KAG6572776.1 F:GO:0005515      F:protein binding
## 8  KAG6572794.1 F:GO:0005515      F:protein binding
## 9  KAG6572837.1 F:GO:0005515      F:protein binding
## 10 KAG6572861.1 F:GO:0005515      F:protein binding
##                                  description     gene_id
## 1           WD repeat-containing protein mio IUM83_18981
## 2               CCT motif-containing protein IUM83_17625
## 3                      reverse transcriptase IUM83_17651
## 4                      reverse transcriptase IUM83_17651
## 5           hypothetical protein IUM83_19800 IUM83_19800
## 6           hypothetical protein IUM83_18690 IUM83_18690
## 7     putative PDZ-domain-containing protein IUM83_15555
## 8  Ankyrin repeats domain-containing protein IUM83_15538
## 9         putative autophagy-related protein IUM83_15545
## 10                                 WW domain IUM83_15509
##                                   product
## 1        WD repeat-containing protein mio
## 2                              CCT domain
## 3                   reverse transcriptase
## 4                   reverse transcriptase
## 5                    hypothetical protein
## 6                    hypothetical protein
## 7  putative PDZ-domain-containing protein
## 8                    hypothetical protein
## 9      putative autophagy-related protein
## 10                              WW domain
gu <- unique(splitgt.df$GO_terms)
head(gu,20)
##  [1] "F:GO:0005515" "P:GO:0015074" "F:GO:0003676" "F:GO:0008270" "P:GO:0006355"
##  [6] "F:GO:0003700" "F:GO:0005509" "P:GO:0036297" "C:GO:0043240" "P:GO:0006629"
## [11] "F:GO:0020037" "P:GO:0005975" "P:GO:0006508" "F:GO:0030248" "C:GO:0005576"
## [16] "P:GO:0006801" "F:GO:0046872" "F:GO:0005525" "P:GO:0007076" "C:GO:0016020"
gul <- lapply(1:length(gu), function(i){
  mygo <- gu[i]
  unique(splitgt.df[splitgt.df$GO_terms == mygo, "gene_id"])
})

names(gul) <- lapply(1:length(gu), function(i){
  mygo <- gu[i]
  desc <- head(splitgt.df[splitgt.df$GO_terms == mygo, "GO_descripton"],1)
  id_desc <- paste(mygo,desc)
})

head(names(gul))
## [1] "F:GO:0005515 F:protein binding"                          
## [2] "P:GO:0015074 P:DNA integration"                          
## [3] "F:GO:0003676 F:nucleic acid binding"                     
## [4] "F:GO:0008270 F:zinc ion binding"                         
## [5] "P:GO:0006355 P:regulation of DNA-templated transcription"
## [6] "F:GO:0003700 F:DNA-binding transcription factor activity"
# Find the element within the list named "X"
matching_element <- gul$X

# Display the matching element
matching_element
## NULL

Assemble count matrix and coldata file

##                 BS01 BS02 BS03 BS04 BS05 BS06 BS07 BS08 BS09 BS10 BS11 BS12
## ENSRNA049758817    0    0    0    0    0    0    0    0    0    0    0    0
## ENSRNA049758844    0    0    0    0    0    0    0    0    0    0    0    0
## ENSRNA049758846    0    0    0    0    0    0    0    0    0    0    0    1
## ENSRNA049758848    0    0    0    1    0    0    0    0    0    0    0    0
## ENSRNA049758851    3    1    1    1    2    1    0    0    2    2    2    3
## ENSRNA049758853    0    0    0    0    0    0    0    0    0    0    0    0
##                 BS13 BS14 BS15 BS16 BS17 BS18 BS19 BS20 BS21 BS22 BS23 BS24
## ENSRNA049758817    0    0    0    0    0    0    0    0    0    0    0    0
## ENSRNA049758844    0    0    0    0    0    0    0    0    0    0    0    0
## ENSRNA049758846    0    0    0    0    1    0    0    0    0    0    0    0
## ENSRNA049758848    0    0    0    0    0    0    0    0    0    0    0    0
## ENSRNA049758851    0    0    0    0    0    2    0    0    1    0    1    0
## ENSRNA049758853    0    0    0    0    0    0    0    0    0    0    0    0

Import files into r

coldata <- read.table('sample_info.tsv')
Pcgenenames <- read.table('pcgeneids.tsv', row.names = 1, quote = "", sep='\t', fill = TRUE, header = TRUE)
IUM83Tanjilcountmatrix <- read.table('countmatrix.tsv') 

Clean countmatrix for P. cinnamomi analysis

collabels <- colnames(IUM83Tanjilcountmatrix) <- (coldata$Sample)
colnames(IUM83Tanjilcountmatrix) <- collabels

IUM83Tanjilcountmatrix <- tibble::rownames_to_column(IUM83Tanjilcountmatrix, "gene_id")

IUM83CountMatrix <- IUM83Tanjilcountmatrix %>% 
  filter(str_detect(gene_id, "IUM83")) 
  
rownames(IUM83CountMatrix) <- IUM83CountMatrix$gene_id
IUM83CountMatrix <- IUM83CountMatrix [ ,-1]
IUM83CountMatrix <- as.data.frame(IUM83CountMatrix)
IUM83CountMatrix <- IUM83CountMatrix[ ,-(1:12)]

head(IUM83CountMatrix)
##             PcHyp_1 PcHyp_2 PcHyp_3 LaPc18_1 LaPc18_2 LaPc18_3 LaPc30_1
## IUM83_00001       0       0       0        0        0        0        0
## IUM83_00002       0       0       0        0        0        0        0
## IUM83_00003       0       0       0        0        0        0        0
## IUM83_00004       0       0       0        0        0        0        0
## IUM83_00005       0       0       0        0        0        0        0
## IUM83_00006       1       0       0        0        0        0        0
##             LaPc30_2 LaPc30_3 LaPc48_1 LaPc48_2 LaPc48_3
## IUM83_00001        0        0        0        0        0
## IUM83_00002        0        0        0        0        0
## IUM83_00003        0        0        0        0        0
## IUM83_00004        0        0        0        0        0
## IUM83_00005        0        0        0        0        0
## IUM83_00006        0        0        0        1        2
#write.table(IUM83CountMatrix, file = "Pc_countmatrix.tsv",quote=FALSE,sep="\t")

Clean coldata for Deseq2 normalisation

colData <- coldata %>% 
  filter(str_detect(Sample, "Pc"))

rownames(colData) <- colData$Sample
colData <- colData [ ,-1] 

Detect and remove outlier genes

#Call a function from WGCNA package that detects outliers 
#Rows should be the samples and the columns genes
gsg <- goodSamplesGenes(t(IUM83CountMatrix))
##  Flagging genes and samples with too many missing values...
##   ..step 1
##   ..step 2
summary(gsg)
##             Length Class  Mode   
## goodGenes   19981  -none- logical
## goodSamples    12  -none- logical
## allOK           1  -none- logical
#If false, then there are outliers
gsg$allOK #False
## [1] FALSE
table(gsg$goodGenes) #3012to be excluded
## 
## FALSE  TRUE 
##  3012 16969
table(gsg$goodSamples) #All 12 samples passed
## 
## TRUE 
##   12
# remove genes that are detectd as outliers
data <- IUM83CountMatrix[gsg$goodGenes == TRUE,]

Detect outlier samples

# detect outlier samples - hierarchical clustering
htree <- hclust(dist(t(data)), method = "average")
groups <- cutree(htree, k=2) # cut tree into clusters

plot(htree, labels(groups))
# draw dendogram with red borders around the clusters
rect.hclust(htree, k=2, border="red")

# Normalisation using DESeq2 package

The WGCNA library requires normalisation using the vst (variance-stabilising transform) function from the DESeq2 package

# exclude outlier samples from the column data to match the new data
#colData <- colData %>% 
#  filter(!row.names(.) %in% outliers)

# making sure the rownames and column names identical for the two data sets
#all(rownames(colData) == colnames(clusters))

# create DESeq Data Set
dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = colData,
                              design = ~ 1) # no model, for normalisation only

# remove all genes with counts < 10 in more than 90% of samples (12*0.9=10.8 ~ 11) as suggested by WGCNA on RNAseq FAQ (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html)
# <10 in more than 90% samples
dds90 <- dds[rowSums(counts(dds) >= 10) >= 11,]
nrow(dds90) # 5403 genes
## [1] 5403
# >= 10) >= 11,] 5403 genes <===== #less than 10 counts in %90 of samples
# >= 15) >= 11,] 4420 genes 

gene.list <- row.names(dds90)
#lapply(gene.list, write, "all_genes.txt", append=TRUE)

# perform variance stabilization
dds_norm <- vst(dds90)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
# get normalized counts
norm.counts <- assay(dds_norm) %>% 
  t()

your_dds <- estimateSizeFactors(dds90)
your_dds <- estimateDispersions(your_dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# Plot dispersion estimates and fits
plotDispEsts(your_dds, main = "Dispersion Trend with Local and Parametric Fits")

#Save normalised dataset for faster analysis next time
saveRDS(norm.counts, "norm.counts.rds")

Network Construction

# load norm.counts object
#norm.counts <- readRDS("norm.counts")

# Choose a set of soft-thresholding powers to create a scale-free network
power <- c(c(1:10), seq(from = 12, to = 30, by = 2))

# Call the network topology analysis function
sft <- pickSoftThreshold(norm.counts,
                  powerVector = power,
                  networkType = "signed",
                  RsquaredCut = 0.90,
                  verbose = 5)
## pickSoftThreshold: will use block size 5403.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 5403 of 5403
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.0299  7.3100        0.98700    2700    2700.0   2770
## 2      2   0.8450  4.3300        0.80100    1850    1900.0   2120
## 3      3   0.8230  2.0600        0.77600    1430    1500.0   1840
## 4      4   0.8140  1.1900        0.76600    1170    1240.0   1660
## 5      5   0.7740  0.7420        0.72300     985    1050.0   1520
## 6      6   0.6450  0.4540        0.57500     851     901.0   1420
## 7      7   0.4710  0.2520        0.37300     747     783.0   1330
## 8      8   0.1560  0.0909       -0.00455     664     686.0   1260
## 9      9   0.0318 -0.0337       -0.14200     596     605.0   1190
## 10    10   0.4220 -0.1390        0.32500     539     537.0   1140
## 11    12   0.7020 -0.2950        0.67700     450     430.0   1040
## 12    14   0.8330 -0.4070        0.83100     384     352.0    963
## 13    16   0.8950 -0.4930        0.90700     332     290.0    897
## 14    18   0.9060 -0.5680        0.91700     291     242.0    840
## 15    20   0.9010 -0.6250        0.90800     257     204.0    791
## 16    22   0.9120 -0.6780        0.92600     230     174.0    747
## 17    24   0.9380 -0.7200        0.95600     206     149.0    709
## 18    26   0.9370 -0.7550        0.95400     187     128.0    674
## 19    28   0.9340 -0.7900        0.94700     170     111.0    643
## 20    30   0.9340 -0.8210        0.95000     155      96.6    614
sft.data <- sft$fitIndices

a1 <- ggplot(sft.data, aes(Power, SFT.R.sq, label = Power)) +
  geom_point() +
  geom_text(nudge_y = 0.1) +
  geom_hline(yintercept = 0.90, color = 'red') +
  labs(x = 'Power',
       y = 'Scale free topology model fit, signed R^2',
       title = 'Scale independence') +
  theme_classic()

a2 <- ggplot(sft.data, aes(Power, mean.k., label = Power)) +
  geom_point() +
  geom_text(nudge_y = 0.1) +
  labs(x = 'Power',
       y = 'Mean Connectivity',
       title = 'Mean connectivity') +
  theme_classic()
  
grid.arrange(a1, a2, nrow = 2) 

soft_power <- sft$powerEstimate #soft_power <- 18

Create network and identify modules

Note: this chunk is set not to run since the blockwise Modules function takes a long time to run. The R object has been saved and is loaded in the next chunk for a faster run time.

# load bwnet object
bwnet <- readRDS("bwnet.rds")

module_eigengenes <- bwnet$MEs %>%
  orderMEs(.)

head(module_eigengenes)
##                MEred MEgreenyellow   MEpurple MElightcyan     MEblue
## PcHyp_1  -0.47101618   -0.39629891 -0.3471852 -0.29931605 -0.4939114
## PcHyp_2  -0.46906481   -0.39987987 -0.3501685 -0.30343391 -0.4781448
## PcHyp_3  -0.46011364   -0.42693672 -0.3415601 -0.25881032 -0.4846273
## LaPc18_1  0.03517933   -0.03152391 -0.1698264  0.37517480  0.2020603
## LaPc18_2  0.07080316   -0.14832273 -0.1236591  0.33844708  0.2552885
## LaPc18_3 -0.09362904   -0.06435771 -0.3406270 -0.07896645  0.2596966
##          MEturquoise     MEgreen    MEbrown   MEyellow      MEpink       MEtan
## PcHyp_1   -0.3856656  0.35237712  0.2502894  0.1294777 -0.03821415 -0.08959661
## PcHyp_2   -0.3781293  0.34358147  0.2337171  0.1570362 -0.04659362 -0.08365339
## PcHyp_3   -0.3505762  0.35324187  0.2211913  0.1123597 -0.03424420 -0.11064913
## LaPc18_1   0.3608623 -0.06791258 -0.3339015 -0.4597694 -0.37617450 -0.18872178
## LaPc18_2   0.3837549 -0.07259095 -0.3876025 -0.4509801 -0.42270461 -0.19930034
## LaPc18_3   0.3880565 -0.76988911 -0.6242172 -0.3038920 -0.48482054 -0.74507581
##          MEmagenta MEmidnightblue    MEsalmon     MEblack      MEcyan
## PcHyp_1  0.2108928      0.2975619  0.26834081  0.41223138  0.29552268
## PcHyp_2  0.2299597      0.3444562  0.25841789  0.42231263  0.25600278
## PcHyp_3  0.2421529      0.3360986  0.27888460  0.40529854  0.24959661
## LaPc18_1 0.1219476     -0.2169757 -0.69699921 -0.33192073 -0.70713697
## LaPc18_2 0.1772510     -0.1149137 -0.03107396 -0.28538077 -0.39688700
## LaPc18_3 0.5129250      0.5796245  0.42898216  0.06976138  0.07427833
##              MEgrey
## PcHyp_1   0.1077323
## PcHyp_2   0.1019971
## PcHyp_3   0.1071106
## LaPc18_1  0.3519742
## LaPc18_2  0.2891394
## LaPc18_3 -0.8189063
# Plot the dendrogram and the module colors before and after merging underneath
plotDendroAndColors(bwnet$dendrograms[[1]], cbind(bwnet$unmergedColors, bwnet$colors),
                    c("unmerged", "merged"),
                    dendroLabels = FALSE,
                    addGuide = TRUE,
                    hang= 0.03,
                    guideHang = 0.05)

 # grey module = all genes that doesn't fall into other modules
 
module.total <- table(bwnet$colors)

Relate the modules to stages

#create traits file - assign 1 if a sample is a certain stage, else assign 0
Dis_traits <- colData %>% 
  mutate(Dis.vs.all = ifelse(grepl('Treated', Treatment), 1, 0)) %>% 
  select(5)


factor_levels <- unique(colData$Stage)

# transform stages into factors and define levels
colData$Stage <- factor(colData$Stage, levels = factor_levels)

traits <- binarizeCategoricalColumns(colData$Stage,
                           #dropFirstLevelVsAll = FALSE,
                           includePairwise = FALSE,
                           includeLevelVsAll = TRUE,
                           minCount = 1)

traits <- cbind(Dis_traits, traits)
# Define numbers of genes and samples
nSamples <- nrow(norm.counts)
nGenes <- ncol(norm.counts)

# correlation for module eigengenes and traits
module.trait.corr <- cor(module_eigengenes, traits, use = 'p')
module.trait.corr.pvals <- corPvalueStudent(module.trait.corr, nSamples)

# Heat map v2 (from WGCNA tutorial)
textMatrix =  paste(signif(module.trait.corr, 2), "\n(",
                    signif(module.trait.corr.pvals, 1), ")", sep = "")
dim(textMatrix) = dim(module.trait.corr)



par(mar = c(6, 8.5, 3, 3));
labeledHeatmap(Matrix = module.trait.corr,
               xLabels = colnames(module.trait.corr),
               yLabels = rownames(module.trait.corr),
               ySymbols = rownames(module.trait.corr),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               textAdj = c(0.5, 0.5),
               setStdMargins = FALSE,
               cex.lab.y = 0.6,
               cex.text = 0.7,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

# get number of genes for each module
table(bwnet$colors)
## 
##        black         blue        brown         cyan        green  greenyellow 
##          243          881          524           57          262           95 
##         grey    lightcyan      magenta midnightblue         pink       purple 
##          428           40          132           47          201           98 
##          red       salmon          tan    turquoise       yellow 
##          249           65           85         1681          315
#Tag genes with module membership and store it in a table
module.gene.mapping <- as.data.frame(bwnet$colors)

Significance in brackets shows how significantly associated the module (cluster of genes) is the trait of interest
Find modules that have significant association with disease state

Calculate the module membership and the associated p-values The module membership/intramodular connectivity is calculated as the correlation of the eigengene and the gene expression profile. This quantifies the similarity of all genes on the array to every module.

Using the gene significance you can identify genes that have a high significance for trait of interest Using the module membership measures you can identify genes with high module membership in interesting modules.

Gene Significance at the Early Pc-lupin interaction

# Define a gene significance variable for Early
GS.Early <- as.numeric(cor(norm.counts, traits$data.Early.vs.all, use = "p"))

# This translates the numeric values into colors
GS.stage_earlyColor = numbers2colors(GS.Early, signed = T)
blocknumber = 1

moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)

datColors = data.frame(bwnet$colors, GS.stage_earlyColor)[bwnet$blockGenes[[blocknumber]], 
    ]

# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors", 
    "GS.stage.Early"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

datKME <- signedKME(norm.counts, module_eigengenes)

# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)

# Calculate the gene significance and associated p-values
early.gene.signf.corr <- cor(norm.counts, traits$data.Early.vs.all, use = 'p')
early.gene.signf.corr.pvals <- corPvalueStudent(early.gene.signf.corr, nSamples)


colorOfColumn = substring(names(datKME), 4)
#selectModules = c("brown", "magenta", "blue", "turquoise", "lightcyan")
selectModules = colorOfColumn[colorOfColumn !="grey"]

#par(mfrow = c(4, length(selectModules)/4))
  for (module in selectModules) {
      column = match(module, colorOfColumn)
      restModule = moduleColorsAutomatic == module
      verboseScatterplot(datKME[restModule, column], GS.Early[restModule], xlab = paste("Module Membership", 
          module, "module"), ylab = "GS.stage_Early", main = paste("kME.", module, 
          "vs. GS"), col = module)
  }

earlygs <- early.gene.signf.corr %>% 
  as.data.frame() %>%
  tibble::rownames_to_column("gene_id") 

earlygsid <- dplyr::right_join(pc.annotations, earlygs, by = 'gene_id')

datKme.id <- as.data.frame(datKME) %>%
  rownames_to_column("gene_id") 

earlyeiginengene <- dplyr::right_join(earlygsid, datKme.id, by = 'gene_id')

Gene Significance at the Mid Pc-lupin interaction

# Define a gene significance variable for Mid
GS.mid <- as.numeric(cor(norm.counts, traits$data.Middle.vs.all, use = "p"))

# This translates the numeric values into colors
GS.stage_midColor = numbers2colors(GS.mid, signed = T)
blocknumber = 1

moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)

datColors = data.frame(bwnet$colors, GS.stage_midColor)[bwnet$blockGenes[[blocknumber]], 
    ]

# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors", 
    "GS.mid"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

datKME <- signedKME(norm.counts, module_eigengenes)

# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)

# Calculate the gene significance and associated p-values
mid.gene.signf.corr <- cor(norm.counts, traits$data.Middle.vs.all, use = 'p')
mid.gene.signf.corr.pvals <- corPvalueStudent(mid.gene.signf.corr, nSamples)


colorOfColumn = substring(names(datKME), 4)
#selectModules = c("red", "tan", "salmon","green", "magenta")
selectModules = colorOfColumn[colorOfColumn !="grey"]

#par(mfrow = c(3, length(selectModules)/3))
  for (module in selectModules) {
      column = match(module, colorOfColumn)
      restModule = moduleColorsAutomatic == module
      verboseScatterplot(datKME[restModule, column], GS.mid[restModule], xlab = paste("Module Membership", 
          module, "module"), ylab = "GS.stage_mid", main = paste("kME.", module, 
          "vs. GS"), col = module)
  }

midgs <- mid.gene.signf.corr %>% 
  as.data.frame() %>%
  tibble::rownames_to_column("gene_id") 

midgsid <- dplyr::right_join(Pcgenenames, midgs, by = 'gene_id')

datKme.id <- as.data.frame(datKME) %>%
  rownames_to_column("gene_id") 

mideiginengene <- dplyr::right_join(midgsid, datKme.id, by = 'gene_id')

Gene Significance at the Late Pc-lupin interaction

# Define a gene significance variable for Late interaction
GS.late <- as.numeric(cor(norm.counts, traits$data.Late.vs.all, use = "p"))

# This translates the numeric values into colors
GS.stage_lateColor = numbers2colors(GS.late, signed = T)
blocknumber = 1

moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)

datColors = data.frame(bwnet$colors, GS.stage_lateColor)[bwnet$blockGenes[[blocknumber]], 
    ]

# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors", 
    "GS.stage.Late"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

datKME <- signedKME(norm.counts, module_eigengenes)

# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)

# Calculate the gene significance and associated p-values
late.gene.signf.corr <- cor(norm.counts, traits$data.Late.vs.all, use = 'p')
late.gene.signf.corr.pvals <- corPvalueStudent(late.gene.signf.corr, nSamples)


colorOfColumn = substring(names(datKME), 4)

#selectModules = c("green", "turquoise", "blue", "grey60", "salmon")
selectModules = colorOfColumn[colorOfColumn !="grey"]

#par(mfrow = c(3, length(selectModules)/3))
  for (module in selectModules) {
      column = match(module, colorOfColumn)
      restModule = moduleColorsAutomatic == module
      verboseScatterplot(datKME[restModule, column], GS.late[restModule], xlab = paste("Module Membership", 
          module, "module"), ylab = "GS.stage_Late", main = paste("kME.", module, 
          "vs. GS"), col = module)
  }

turquoise module (Horvath cares not for the reported cor=). Instead, Look at the y-axis: genes that have high positive module membership tend to be highly positively correlated with the late interaction of Pc in lupin, whereas, genes with negative values in the module they have negative relations with the late interaction of Pc in lupin. Can select either; genes with high positive/negative GS or high kMe (hub genes)

lategs <- late.gene.signf.corr.pvals %>% 
  as.data.frame() %>%
  tibble::rownames_to_column("gene_id")

lategsid <- dplyr::right_join(Pcgenenames, lategs, by = 'gene_id')

datKme.id <- as.data.frame(datKME) %>%
  rownames_to_column("gene_id") 

lateeiginengene <- dplyr::right_join(lategsid, datKme.id, by = 'gene_id')
# chooseTopHubInEachModule returns the gene in each module with the highest connectivity, looking at all genes in the expression file

PcHubgenes <- chooseTopHubInEachModule(norm.counts,
                         colorh = module.gene.mapping,
                         omitColor = "grey",
                         power =  2,
                         type = "signed")
PcHubgenes 
##         black          blue         brown          cyan         green 
## "IUM83_03164" "IUM83_12926" "IUM83_06000" "IUM83_08664" "IUM83_06699" 
##   greenyellow     lightcyan       magenta  midnightblue          pink 
## "IUM83_02874" "IUM83_10668" "IUM83_04177" "IUM83_09489" "IUM83_14330" 
##        purple           red        salmon           tan     turquoise 
## "IUM83_02522" "IUM83_07133" "IUM83_02405" "IUM83_19502" "IUM83_15977" 
##        yellow 
## "IUM83_18642"
PcHubgenes.df <- PcHubgenes %>% 
  as.data.frame(row.names = NULL, stringAsFactors = FALSE) 

PcHubgenes.df <- PcHubgenes.df %>% 
       rename("gene_id" = ".")

PcHubgenes.df <- tibble::rownames_to_column(PcHubgenes.df, "module")


PcHubgenes.df <- dplyr::right_join(pc.annotations, PcHubgenes.df, by = 'gene_id')
PcHubgenes.df
##        gene_id                                                    product
## 1  IUM83_06000                                       hypothetical protein
## 2  IUM83_15977                                      heat shock protein 70
## 3  IUM83_07133            aldehyde dehydrogenase, mitochondrial precursor
## 4  IUM83_19502                                            Annexin protein
## 5  IUM83_09489                                RNAse P, Rpr2/Rpp21 subunit
## 6  IUM83_02405                                       hypothetical protein
## 7  IUM83_02522                                      putative ribonuclease
## 8  IUM83_08664                             snf7-domain-containing protein
## 9  IUM83_04177                                         PPR repeat protein
## 10 IUM83_03164                     putative centrosome-associated protein
## 11 IUM83_10668              putative dihydrolipoamide succinyltransferase
## 12 IUM83_06699                                       hypothetical protein
## 13 IUM83_12926                                       hypothetical protein
## 14 IUM83_14330 peroxisomal biogenesis factor 11 domain-containing protein
## 15 IUM83_18642                                                  C2 domain
## 16 IUM83_02874                   putative inorganic phosphate transporter
##      protein_id       module
## 1  KAG6583047.1        brown
## 2  KAG6580057.1    turquoise
## 3  KAG6574467.1          red
## 4  KAG6619070.1          tan
## 5  KAG6617901.1 midnightblue
## 6  KAG6617545.1       salmon
## 7  KAG6617217.1       purple
## 8  KAG6614960.1         cyan
## 9  KAG6614562.1      magenta
## 10 KAG6612551.1        black
## 11 KAG6612350.1    lightcyan
## 12 KAG6610530.1        green
## 13 KAG6608957.1         blue
## 14 KAG6606500.1         pink
## 15 KAG6596141.1       yellow
## 16 KAG6587228.1  greenyellow
# Heatmap of old module eigen-genes and samples
#pdf(file="oldMEs.pdf",heigh=80,width=20)

library("pheatmap")
library(RColorBrewer)

MEs <- bwnet$MEs
rownames(MEs)=names(norm.counts[,9])
#pheatmap(MEs,cluster_col=T,cluster_row=T,show_rownames=F,show_colnames=T,fontsize=6)

col_ann <- colData[,c(1,4)]
rownames(col_ann) <- col_ann$Sample
col_ann <- data.frame(col_ann)
col_ann$Stage <- as.factor(col_ann$Stage)
col_ann <- col_ann[order(col_ann$Stage),]
col_ann$sample_ID <- NULL
head(col_ann, 9)
##            Sample      Stage
## PcHyp_1   PcHyp_1 Vegetative
## PcHyp_2   PcHyp_2 Vegetative
## PcHyp_3   PcHyp_3 Vegetative
## LaPc18_1 LaPc18_1      Early
## LaPc18_2 LaPc18_2      Early
## LaPc18_3 LaPc18_3      Early
## LaPc30_1 LaPc30_1     Middle
## LaPc30_2 LaPc30_2     Middle
## LaPc30_3 LaPc30_3     Middle
ann_color <- list("col_ann" = c("In Planta" = "purple", 
                                "Early" = "yellow",
                                "Middle" = "green",
                                "Late" = "blue"))

data.me <- data.frame(MEs)
data.me <- data.me[order(match(rownames(data.me), rownames(col_ann))),]
#dim(data.me)

#pdf(file="newMEs.pdf",heigh=60,width=20)
rownames(MEs)=names(colData[ ,1])
pheatmap(data.me,cluster_col=T,cluster_row=F,show_rownames=F,
         show_colnames=T,fontsize=6,
         annotation_row = col_ann, annotation_colors = ann_color)

writeGMT <- function (object, fname ){
  if (class(object) != "list") stop("object should be of class 'list'")
  if(file.exists(fname)) unlink(fname)
  for (iElement in 1:length(object)){
    write.table(t(c(make.names(rep(names(object)[iElement],2)),object[[iElement]])),
                sep="\t",quote=FALSE,
                file=fname,append=TRUE,col.names=FALSE,row.names=FALSE)
  }
}

writeGMT(object=gul,fname="goterms.gmt")
genesets <- read.gmt("goterms.gmt")
bg <- gene.list
get_module_genes <- function(eigengene_df) {
  module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                   "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                   "salmon", "black", "cyan")
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_genes_list_early <- get_module_genes(earlyeiginengene)
module_genes_list_mid <- get_module_genes(mideiginengene)
module_genes_list_late <- get_module_genes(lateeiginengene)

get_module_genes_all <- function(eigengene_df, module_list) {
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                 "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                 "salmon", "black", "cyan")

module_genes_list_early <- get_module_genes_all(earlyeiginengene, module_list)

results <- lapply(names(module_genes_list_early), function(module.color) {
  ora_res <- as.data.frame(enricher(gene = module_genes_list_early[[module.color]],
                                    universe = bg,
                                    maxGSSize = 5000,
                                    TERM2GENE = genesets,
                                    pAdjustMethod = "fdr",
                                    pvalueCutoff = 1,
                                    qvalueCutoff = 1))
  
  ora_res$geneID <- NULL
  ora_res <- subset(ora_res, (ora_res$p.adjust < 0.05 & ora_res$Count >= 5))
  ora_res_names <- rownames(ora_res)
  
  ora_res$GeneRatio <- as.character(ora_res$GeneRatio)
  
  gr <- as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 2))
  
  br <- as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 2))
  
  ora_res$es <- gr/br
  ora_res <- ora_res[order(-ora_res$es), ]
  ora_res$Description <- NULL
  
  return(ora_res)
})
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_17731,IUM83_10855,IUM83_12338,IUM83_14578,IUM83_05370,IUM83_00325
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_12734,IUM83_10070,IUM83_04864,IUM83_13520,IUM83_10855,IUM83_03578
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_10158,IUM83_04864,IUM83_10069,IUM83_13005,IUM83_19883,IUM83_02530
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_10855,IUM83_13355,IUM83_19790,IUM83_18049,IUM83_04642,IUM83_09871
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_13533,IUM83_12735,IUM83_10158,IUM83_18607,IUM83_12508,IUM83_19677
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_14369,IUM83_09310,IUM83_04863,IUM83_13305,IUM83_00832,IUM83_12734
## --> return NULL...
names(results) <- names(module_genes_list_early)

all_ids <- unlist(lapply(results, function(module_results) {
  module_results$ID[module_results$es >= 3]
}))

unique_ids <- unique(all_ids)

print(unique_ids)
##  [1] "C.GO.0015934.C.large.ribosomal.subunit"                             
##  [2] "C.GO.0005576.C.extracellular.region"                                
##  [3] "F.GO.0003735.F.structural.constituent.of.ribosome"                  
##  [4] "C.GO.0005840.C.ribosome"                                            
##  [5] "P.GO.0006412.P.translation"                                         
##  [6] "F.GO.0004252.F.serine.type.endopeptidase.activity"                  
##  [7] "F.GO.0004553.F.hydrolase.activity..hydrolyzing.O.glycosyl.compounds"
##  [8] "F.GO.0140359.F.ABC.type.transporter.activity"                       
##  [9] "P.GO.0055085.P.transmembrane.transport"                             
## [10] "C.GO.0005852.C.eukaryotic.translation.initiation.factor.3.complex"  
## [11] "P.GO.0006364.P.rRNA.processing"                                     
## [12] "C.GO.0005730.C.nucleolus"                                           
## [13] "P.GO.0001522.P.pseudouridine.synthesis"                             
## [14] "P.GO.0042254.P.ribosome.biogenesis"                                 
## [15] "F.GO.0051082.F.unfolded.protein.binding"                            
## [16] "F.GO.0003746.F.translation.elongation.factor.activity"              
## [17] "F.GO.0003743.F.translation.initiation.factor.activity"              
## [18] "F.GO.0009982.F.pseudouridine.synthase.activity"                     
## [19] "P.GO.0006414.P.translational.elongation"                            
## [20] "F.GO.0051539.F.4.iron..4.sulfur.cluster.binding"                    
## [21] "P.GO.0006457.P.protein.folding"                                     
## [22] "F.GO.0003899.F.DNA.directed.5..3..RNA.polymerase.activity"          
## [23] "P.GO.0006413.P.translational.initiation"                            
## [24] "C.GO.0015935.C.small.ribosomal.subunit"                             
## [25] "F.GO.0017056.F.structural.constituent.of.nuclear.pore"
# Get all unique IDs across all color modules
unique_ids <- unique(unlist(lapply(results, function(x) x$ID)))

# Create empty matrix to hold values for heatmap
heatmap_matrix <- matrix(0, nrow = length(unique_ids), ncol = length(module_list), 
                         dimnames = list(unique_ids, module_list))

# Fill in matrix with es values where available
for (module_color in names(results)) {
  module_results <- results[[module_color]]
  module_genes <- module_results$ID
  module_es <- module_results$es
  for (i in seq_along(unique_ids)) {
    if (unique_ids[i] %in% module_genes) {
      heatmap_matrix[i, module_color] <- module_es[which(module_genes == unique_ids[i])]
    }
  }
}


eorahm <- pheatmap(heatmap_matrix, 
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = colorRampPalette(c("white", "green"))(100),
         scale = "none",
         fontsize_col = 8,
         fontsize_row = 6,
        main = "Early Module Enrichment Score",
         filename = "pheatmap1.png")
get_module_genes <- function(eigengene_df) {
  module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                   "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                   "salmon", "black", "cyan")
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_genes_list_early <- get_module_genes(earlyeiginengene)
module_genes_list_mid <- get_module_genes(mideiginengene)
module_genes_list_late <- get_module_genes(lateeiginengene)

get_module_genes_all <- function(eigengene_df, module_list) {
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                 "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                 "salmon", "black", "cyan")

module_genes_list_mid <- get_module_genes_all(mideiginengene, module_list)

results <- lapply(names(module_genes_list_mid), function(module.color) {
  ora_res <- as.data.frame(enricher(gene = module_genes_list_mid[[module.color]],
                                    universe = bg,
                                    maxGSSize = 5000,
                                    TERM2GENE = genesets,
                                    pAdjustMethod = "fdr",
                                    pvalueCutoff = 1,
                                    qvalueCutoff = 1))
  
  ora_res$geneID <- NULL
  ora_res <- subset(ora_res, (ora_res$p.adjust < 0.05 & ora_res$Count >= 5))
  ora_res_names <- rownames(ora_res)
  
  ora_res$GeneRatio <- as.character(ora_res$GeneRatio)
  
  gr <- as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 2))
  
  br <- as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 2))
  
  ora_res$es <- gr/br
  ora_res <- ora_res[order(-ora_res$es), ]
  ora_res$Description <- NULL
  
  return(ora_res)
})
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_00833,IUM83_08971,IUM83_01656,IUM83_00469,IUM83_03578,IUM83_01546
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_11128,IUM83_11613,IUM83_03578,IUM83_12735,IUM83_16051,IUM83_04464
## --> return NULL...
## --> No gene can be mapped....
## --> Expected input gene ID: IUM83_19790,IUM83_19107,IUM83_00604,IUM83_09870,IUM83_03576,IUM83_18607
## --> return NULL...
names(results) <- names(module_genes_list_mid)

all_ids <- unlist(lapply(results, function(module_results) {
  module_results$ID[module_results$es >= 2]
}))

unique_ids <- unique(all_ids)

print(unique_ids)
##  [1] "P.GO.0006096.P.glycolytic.process"                                                                             
##  [2] "F.GO.0010181.F.FMN.binding"                                                                                    
##  [3] "C.GO.0005576.C.extracellular.region"                                                                           
##  [4] "P.GO.0006749.P.glutathione.metabolic.process"                                                                  
##  [5] "F.GO.0016620.F.oxidoreductase.activity..acting.on.the.aldehyde.or.oxo.group.of.donors..NAD.or.NADP.as.acceptor"
##  [6] "F.GO.0004497.F.monooxygenase.activity"                                                                         
##  [7] "F.GO.0140359.F.ABC.type.transporter.activity"                                                                  
##  [8] "F.GO.0004553.F.hydrolase.activity..hydrolyzing.O.glycosyl.compounds"                                           
##  [9] "F.GO.0051287.F.NAD.binding"                                                                                    
## [10] "F.GO.0004252.F.serine.type.endopeptidase.activity"                                                             
## [11] "F.GO.0030170.F.pyridoxal.phosphate.binding"                                                                    
## [12] "F.GO.0016491.F.oxidoreductase.activity"                                                                        
## [13] "P.GO.0005975.P.carbohydrate.metabolic.process"                                                                 
## [14] "F.GO.0022857.F.transmembrane.transporter.activity"                                                             
## [15] "P.GO.0055085.P.transmembrane.transport"                                                                        
## [16] "P.GO.0009058.P.biosynthetic.process"                                                                           
## [17] "P.GO.0006511.P.ubiquitin.dependent.protein.catabolic.process"                                                  
## [18] "C.GO.0005852.C.eukaryotic.translation.initiation.factor.3.complex"                                             
## [19] "F.GO.0003746.F.translation.elongation.factor.activity"                                                         
## [20] "C.GO.0005730.C.nucleolus"                                                                                      
## [21] "F.GO.0051082.F.unfolded.protein.binding"                                                                       
## [22] "F.GO.0009982.F.pseudouridine.synthase.activity"                                                                
## [23] "P.GO.0001522.P.pseudouridine.synthesis"                                                                        
## [24] "P.GO.0006414.P.translational.elongation"                                                                       
## [25] "F.GO.0051539.F.4.iron..4.sulfur.cluster.binding"                                                               
## [26] "F.GO.0003743.F.translation.initiation.factor.activity"                                                         
## [27] "P.GO.0006457.P.protein.folding"                                                                                
## [28] "P.GO.0006364.P.rRNA.processing"                                                                                
## [29] "P.GO.0042254.P.ribosome.biogenesis"                                                                            
## [30] "P.GO.0006413.P.translational.initiation"                                                                       
## [31] "F.GO.0003724.F.RNA.helicase.activity"                                                                          
## [32] "C.GO.0005739.C.mitochondrion"                                                                                  
## [33] "F.GO.0050661.F.NADP.binding"                                                                                   
## [34] "F.GO.0016887.F.ATP.hydrolysis.activity"                                                                        
## [35] "C.GO.0005743.C.mitochondrial.inner.membrane"
# Get all unique IDs across all color modules
unique_ids <- unique(unlist(lapply(results, function(x) x$ID)))

# Create empty matrix to hold values for heatmap
heatmap_matrix <- matrix(0, nrow = length(unique_ids), ncol = length(module_list), 
                         dimnames = list(unique_ids, module_list))

# Fill in matrix with es values where available
for (module_color in names(results)) {
  module_results <- results[[module_color]]
  module_genes <- module_results$ID
  module_es <- module_results$es
  for (i in seq_along(unique_ids)) {
    if (unique_ids[i] %in% module_genes) {
      heatmap_matrix[i, module_color] <- module_es[which(module_genes == unique_ids[i])]
    }
  }
}

morahm <-pheatmap(heatmap_matrix, 
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = colorRampPalette(c("white", "blue"))(100),
         scale = "none",
         fontsize_col = 8,
         fontsize_row = 6,
         main = "Mid Module Enrichment Score",
         filename = "pheatmap2.png")
get_module_genes <- function(eigengene_df) {
  module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                   "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                   "salmon", "black", "cyan")
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_genes_list_early <- get_module_genes(earlyeiginengene)
module_genes_list_mid <- get_module_genes(mideiginengene)
module_genes_list_late <- get_module_genes(lateeiginengene)

get_module_genes_all <- function(eigengene_df, module_list) {
  gene_list <- list()
  
  module_genes <- lapply(module_list, function(module) {
    eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
  })
  
  names(module_genes) <- module_list
  return(module_genes)
}

module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
                 "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
                 "salmon", "black", "cyan")

module_genes_list_late <- get_module_genes_all(lateeiginengene, module_list)

results <- lapply(names(module_genes_list_late), function(module.color) {
  ora_res <- as.data.frame(enricher(gene = module_genes_list_late[[module.color]],
                                    universe = bg,
                                    maxGSSize = 5000,
                                    TERM2GENE = genesets,
                                    pAdjustMethod = "fdr",
                                    pvalueCutoff = 1,
                                    qvalueCutoff = 1))
  
  ora_res$geneID <- NULL
  ora_res <- subset(ora_res, (ora_res$p.adjust < 0.05 & ora_res$Count >= 5))
  ora_res_names <- rownames(ora_res)
  
  ora_res$GeneRatio <- as.character(ora_res$GeneRatio)
  
  gr <- as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 2))
  
  br <- as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 2))
  
  ora_res$es <- gr/br
  ora_res <- ora_res[order(-ora_res$es), ]
  ora_res$Description <- NULL
  
  return(ora_res)
})

names(results) <- names(module_genes_list_late)

all_ids <- unlist(lapply(results, function(module_results) {
  module_results$ID[module_results$es >= 3]
}))

unique_ids <- unique(all_ids)

print(unique_ids)
##  [1] "C.GO.0015934.C.large.ribosomal.subunit"                             
##  [2] "P.GO.0006096.P.glycolytic.process"                                  
##  [3] "F.GO.0004497.F.monooxygenase.activity"                              
##  [4] "C.GO.0005576.C.extracellular.region"                                
##  [5] "F.GO.0003735.F.structural.constituent.of.ribosome"                  
##  [6] "F.GO.0004553.F.hydrolase.activity..hydrolyzing.O.glycosyl.compounds"
##  [7] "F.GO.0140359.F.ABC.type.transporter.activity"                       
##  [8] "F.GO.0030170.F.pyridoxal.phosphate.binding"                         
##  [9] "F.GO.0016491.F.oxidoreductase.activity"                             
## [10] "F.GO.0051287.F.NAD.binding"                                         
## [11] "C.GO.0005852.C.eukaryotic.translation.initiation.factor.3.complex"  
## [12] "F.GO.0003746.F.translation.elongation.factor.activity"              
## [13] "C.GO.0005730.C.nucleolus"                                           
## [14] "P.GO.0006414.P.translational.elongation"                            
## [15] "P.GO.0006364.P.rRNA.processing"                                     
## [16] "P.GO.0042254.P.ribosome.biogenesis"                                 
## [17] "F.GO.0003743.F.translation.initiation.factor.activity"              
## [18] "F.GO.0051082.F.unfolded.protein.binding"                            
## [19] "P.GO.0006457.P.protein.folding"                                     
## [20] "P.GO.0006412.P.translation"                                         
## [21] "C.GO.0005840.C.ribosome"                                            
## [22] "C.GO.0015935.C.small.ribosomal.subunit"                             
## [23] "P.GO.0016567.P.protein.ubiquitination"                              
## [24] "P.GO.0071704.P.organic.substance.metabolic.process"                 
## [25] "F.GO.0004842.F.ubiquitin.protein.transferase.activity"              
## [26] "P.GO.0007018.P.microtubule.based.movement"
# Get all unique IDs across all color modules
unique_ids <- unique(unlist(lapply(results, function(x) x$ID)))

# Create empty matrix to hold values for heatmap
heatmap_matrix <- matrix(0, nrow = length(unique_ids), ncol = length(module_list), 
                         dimnames = list(unique_ids, module_list))

# Fill in matrix with es values where available
for (module_color in names(results)) {
  module_results <- results[[module_color]]
  module_genes <- module_results$ID
  module_es <- module_results$es
  for (i in seq_along(unique_ids)) {
    if (unique_ids[i] %in% module_genes) {
      heatmap_matrix[i, module_color] <- module_es[which(module_genes == unique_ids[i])]
    }
  }
}

late <-pheatmap(heatmap_matrix, 
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = colorRampPalette(c("white", "red"))(100),
         scale = "none",
         fontsize_col = 8,
         fontsize_row = 6,
         main = "Late Module Enrichment Score", 
         filename = "pheatmap3.png")


hml <- as.data.frame.matrix(heatmap_matrix)
module_genes_list_late <- get_module_genes_all(lateeiginengene, module_list)

results <- lapply(names(module_genes_list_late), function(module.color) {
  ora_res <- as.data.frame(enricher(gene = module_genes_list_late[[module.color]],
                                    universe = bg,
                                    maxGSSize = 5000,
                                    TERM2GENE = genesets,
                                    pAdjustMethod = "fdr",
                                    pvalueCutoff = 1,
                                    qvalueCutoff = 1))
  
  ora_res$geneID <- NULL
  ora_res <- subset(ora_res, (ora_res$p.adjust < 0.05 & ora_res$Count >= 5))
  ora_res_names <- rownames(ora_res)
  
  ora_res$GeneRatio <- as.character(ora_res$GeneRatio)
  
  gr <- as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 2))
  
  br <- as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 1)) /
    as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 2))
  
  ora_res$es <- gr/br
  ora_res <- ora_res[order(-ora_res$es), ]
  ora_res$Description <- NULL
  
  return(ora_res)
})

names(results) <- names(module_genes_list_late)

all_ids <- unlist(lapply(results, function(module_results) {
  module_results$ID[module_results$es >= 3]
}))

unique_ids <- unique(all_ids)

print(unique_ids)
##  [1] "C.GO.0015934.C.large.ribosomal.subunit"                             
##  [2] "P.GO.0006096.P.glycolytic.process"                                  
##  [3] "F.GO.0004497.F.monooxygenase.activity"                              
##  [4] "C.GO.0005576.C.extracellular.region"                                
##  [5] "F.GO.0003735.F.structural.constituent.of.ribosome"                  
##  [6] "F.GO.0004553.F.hydrolase.activity..hydrolyzing.O.glycosyl.compounds"
##  [7] "F.GO.0140359.F.ABC.type.transporter.activity"                       
##  [8] "F.GO.0030170.F.pyridoxal.phosphate.binding"                         
##  [9] "F.GO.0016491.F.oxidoreductase.activity"                             
## [10] "F.GO.0051287.F.NAD.binding"                                         
## [11] "C.GO.0005852.C.eukaryotic.translation.initiation.factor.3.complex"  
## [12] "F.GO.0003746.F.translation.elongation.factor.activity"              
## [13] "C.GO.0005730.C.nucleolus"                                           
## [14] "P.GO.0006414.P.translational.elongation"                            
## [15] "P.GO.0006364.P.rRNA.processing"                                     
## [16] "P.GO.0042254.P.ribosome.biogenesis"                                 
## [17] "F.GO.0003743.F.translation.initiation.factor.activity"              
## [18] "F.GO.0051082.F.unfolded.protein.binding"                            
## [19] "P.GO.0006457.P.protein.folding"                                     
## [20] "P.GO.0006412.P.translation"                                         
## [21] "C.GO.0005840.C.ribosome"                                            
## [22] "C.GO.0015935.C.small.ribosomal.subunit"                             
## [23] "P.GO.0016567.P.protein.ubiquitination"                              
## [24] "P.GO.0071704.P.organic.substance.metabolic.process"                 
## [25] "F.GO.0004842.F.ubiquitin.protein.transferase.activity"              
## [26] "P.GO.0007018.P.microtubule.based.movement"
# Get all unique IDs across all color modules
unique_ids <- unique(unlist(lapply(results, function(x) x$ID)))

# Create empty matrix to hold values for heatmap
heatmap_matrix <- matrix(0, nrow = length(unique_ids), ncol = length(module_list), 
                         dimnames = list(unique_ids, module_list))

# Fill in matrix with es values where available
for (module_color in names(results)) {
  module_results <- results[[module_color]]
  module_genes <- module_results$ID
  module_es <- module_results$es
  for (i in seq_along(unique_ids)) {
    if (unique_ids[i] %in% module_genes) {
      heatmap_matrix[i, module_color] <- module_es[which(module_genes == unique_ids[i])]
    }
  }
}

late <- pheatmap(heatmap_matrix, 
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = colorRampPalette(c("white", "red"))(100),
         scale = "none",
         fontsize_col = 8,
         fontsize_row = 6,
         main = "Late Module Enrichment Score", 
         filename = "pheatmap4.png")

## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3          pheatmap_1.0.12            
##  [3] clusterProfiler_4.8.1       edgeR_3.42.2               
##  [5] limma_3.56.0                rtracklayer_1.60.0         
##  [7] reshape2_1.4.4              gridExtra_2.3              
##  [9] lubridate_1.9.2             forcats_1.0.0              
## [11] stringr_1.5.0               dplyr_1.1.2                
## [13] purrr_1.0.1                 readr_2.1.4                
## [15] tidyr_1.3.0                 tibble_3.2.1               
## [17] ggplot2_3.4.2               tidyverse_2.0.0            
## [19] GEOquery_2.68.0             DESeq2_1.40.1              
## [21] SummarizedExperiment_1.30.1 Biobase_2.60.0             
## [23] MatrixGenerics_1.12.0       matrixStats_0.63.0         
## [25] GenomicRanges_1.52.0        GenomeInfoDb_1.36.0        
## [27] IRanges_2.34.0              S4Vectors_0.38.1           
## [29] BiocGenerics_0.46.0         WGCNA_1.72-1               
## [31] fastcluster_1.2.3           dynamicTreeCut_1.63-1      
## [33] kableExtra_1.3.4           
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.0            BiocIO_1.10.0            bitops_1.0-7            
##   [4] ggplotify_0.1.0          polyclip_1.10-4          preprocessCore_1.62.1   
##   [7] XML_3.99-0.14            rpart_4.1.19             lifecycle_1.0.3         
##  [10] doParallel_1.0.17        lattice_0.21-8           MASS_7.3-60             
##  [13] backports_1.4.1          magrittr_2.0.3           Hmisc_5.0-1             
##  [16] sass_0.4.5               rmarkdown_2.21           jquerylib_0.1.4         
##  [19] yaml_2.3.7               cowplot_1.1.1            DBI_1.1.3               
##  [22] zlibbioc_1.46.0          rvest_1.0.3              ggraph_2.1.0            
##  [25] RCurl_1.98-1.12          yulab.utils_0.0.6        nnet_7.3-18             
##  [28] tweenr_2.0.2             GenomeInfoDbData_1.2.10  enrichplot_1.20.0       
##  [31] ggrepel_0.9.3            tidytree_0.4.2           svglite_2.1.1           
##  [34] codetools_0.2-19         DelayedArray_0.26.1      DOSE_3.26.1             
##  [37] xml2_1.3.4               ggforce_0.4.1            tidyselect_1.2.0        
##  [40] aplot_0.1.10             farver_2.1.1             viridis_0.6.2           
##  [43] base64enc_0.1-3          webshot_0.5.4            GenomicAlignments_1.36.0
##  [46] jsonlite_1.8.4           tidygraph_1.2.3          Formula_1.2-5           
##  [49] survival_3.5-5           iterators_1.0.14         systemfonts_1.0.4       
##  [52] foreach_1.5.2            tools_4.3.0              treeio_1.24.0           
##  [55] Rcpp_1.0.10              glue_1.6.2               xfun_0.39               
##  [58] qvalue_2.32.0            withr_2.5.0              fastmap_1.1.1           
##  [61] fansi_1.0.4              digest_0.6.31            timechange_0.2.0        
##  [64] R6_2.5.1                 gridGraphics_0.5-1       colorspace_2.1-0        
##  [67] GO.db_3.17.0             RSQLite_2.3.1            utf8_1.2.3              
##  [70] generics_0.1.3           data.table_1.14.8        graphlayouts_0.8.4      
##  [73] httr_1.4.5               htmlwidgets_1.6.2        S4Arrays_1.0.1          
##  [76] scatterpie_0.1.9         pkgconfig_2.0.3          gtable_0.3.3            
##  [79] blob_1.2.4               impute_1.74.1            XVector_0.40.0          
##  [82] shadowtext_0.1.2         htmltools_0.5.5          fgsea_1.26.0            
##  [85] scales_1.2.1             png_0.1-8                ggfun_0.0.9             
##  [88] knitr_1.42               rstudioapi_0.14          tzdb_0.3.0              
##  [91] rjson_0.2.21             nlme_3.1-162             checkmate_2.2.0         
##  [94] cachem_1.0.7             parallel_4.3.0           HDO.db_0.99.1           
##  [97] foreign_0.8-84           AnnotationDbi_1.62.1     restfulr_0.0.15         
## [100] pillar_1.9.0             grid_4.3.0               vctrs_0.6.2             
## [103] cluster_2.1.4            htmlTable_2.4.1          evaluate_0.20           
## [106] cli_3.6.1                locfit_1.5-9.7           compiler_4.3.0          
## [109] Rsamtools_2.16.0         rlang_1.1.1              crayon_1.5.2            
## [112] labeling_0.4.2           plyr_1.8.8               stringi_1.7.12          
## [115] viridisLite_0.4.1        BiocParallel_1.34.0      munsell_0.5.0           
## [118] Biostrings_2.68.0        lazyeval_0.2.2           GOSemSim_2.26.0         
## [121] Matrix_1.5-4             patchwork_1.1.2          hms_1.1.3               
## [124] bit64_4.0.5              KEGGREST_1.40.0          highr_0.10              
## [127] igraph_1.4.2             memoise_2.0.1            bslib_0.4.2             
## [130] ggtree_3.8.0             fastmatch_1.1-3          bit_4.0.5               
## [133] downloader_0.4           gson_0.1.0               ape_5.7-1